********************************************************************************
* clean_worker.do
* Purpose: Build the worker-level annual datasets for propensity-score matching
*          and four auxiliary firm/market-level datasets.
*
* Outputs:
*   worker_YYYY.dta      - Worker x firm x year data with all matching variables
*                          (one file per year, 2001-2017)
*   firm_year_cz.dta     - Firm x year count of distinct commuting zones (CZs)
*   same_market.dta      - Deal-level indicator: acquirer & target share a market
*   same_market_firm.dta - Deal x market-level indicator of market overlap
*   treated_market.dta   - CZ x industry x year panel marking "treated" markets
*
* Market definition throughout: CZ (commuting zone, sac_syn) x NAICS industry
*
* Inputs: T4ROE_YYYY, T1_YYYY, worker_firm_panel, moonlighter, match_effect,
*         akm, firm_YYYY, canada_cpi, t4roe_add_YYYY (province), t1_geo_YYYY (CZ)
********************************************************************************
set more off

//------------------------------------------------------------------------------
// SECTION 1: WORKER-LEVEL ANNUAL DATASETS (worker_YYYY.dta)
//
// For each year, starts from T4ROE (employer-employee earnings records) and
// enriches each worker-firm observation with:
//   - 2-digit NAICS sector
//   - Moonlighting indicator (from moonlighter.dta)
//   - Tenure (years since first job at this firm)
//   - Worker demographics: age, sex, UI earnings (from T1)
//   - Job province (from t4roe_add)
//   - Worker commuting zone (from t1_geo)
//   - Worker-firm match effect (from match_effect.dta)
//   - Firm AKM fixed effect (from akm.dta)
//   - Firm-level NALMF characteristics (revenue, employment, payroll, province)
//   - CPI deflator (base year 2011)
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {

	use $data1/T4ROE_`y' , replace

	//------------------------------------------------------------------
	// 2-digit NAICS sector (same grouping logic as in clean_firm.do)
	//------------------------------------------------------------------
	destring naics, replace
	gen 	naics2 		= int(naics/100)
	replace naics2  	= 31 if naics2 == 32 | naics2 == 33
	replace naics2  	= 44 if naics2 == 45
	replace naics2  	= 48 if naics2 == 49
	replace naics2 		= 54 if naics2 == 56 | naics2 == 61	| naics2 ==62

	//------------------------------------------------------------------
	// Moonlighting indicator: merge from moonlighter.dta
	// moonlighter == 1 if the worker held multiple jobs in any year
	//------------------------------------------------------------------
	merge m:1 casenum2019 using $data/moonlighter, keep(1 3)
	gen 	moonlighter = 1 if _merge == 3
	replace moonlighter = 0 if _merge == 1
	drop _merge

	//------------------------------------------------------------------
	// Tenure: merge from worker_firm_panel for spell start/end years
	// tenure = current year - first_year_at_firm (years at this firm)
	//------------------------------------------------------------------
	merge m:1 casenum2019 entid_syn year using $data/worker_firm_panel, keepusing(first_year_at_firm last_year_at_firm max_gap) keep(1 3) nogen
	gen 	tenure = year - first_year_at_firm

	//------------------------------------------------------------------
	// Worker demographics from individual income tax return (T1):
	//   t1_age_recorded  - Worker age
	//   t1_sex_recorded  - Worker sex (1=Male, 2=Female)
	//   t1_uiearn        - UI-insurable earnings
	//------------------------------------------------------------------
	merge m:1 casenum2019 			using $data1/T1_`y', keepusing(t1_age_recorded t1_sex_recorded t1_uiearn) keep(1 3) nogen

	* Worker's commuting zone (SAC code from T1 geographic file)
	merge m:1 casenum2019 			using $data3/t1_geo_`y', keepusing(SAC_syn) keep(1 3) nogen

	//------------------------------------------------------------------
	// Worker-firm match effect (logearnings_met) from Woodcock decomposition
	// Firm employer fixed effect (fe) from AKM estimation
	//------------------------------------------------------------------
	merge m:1 casenum2019 entid_syn using $data/match_effect, keepusing(logearnings_met) keep(1 3) nogen
	merge m:1 entid_syn 			using $data/akm, keepusing(fe) keep(1 3) nogen

	//------------------------------------------------------------------
	// Firm-level NALMF characteristics (revenue, payroll, employment, province)
	//------------------------------------------------------------------
	merge m:1 entid_syn 			using $data/firm_`y', keepusing(BirthDate T4_Payroll T4_ILU PD7_AvgEmp_NonZero total_revenue total_expense OPAddressProvince avg_payrolls UltimateParentID_syn) keep(1 3) nogen

	//------------------------------------------------------------------
	// CPI deflator: base year 2011 (CPI_2011 = 119.9)
	// CPI_base_2011 converts nominal values to 2011 real dollars
	//------------------------------------------------------------------
	merge m:1 year 					using $data/canada_cpi, keepusing(CPI) keep(1 3) nogen
	gen CPI_base_2011 = CPI/119.9

	* Convert SAC (commuting zone) from string to numeric
	destring SAC_syn, g(sac_syn)
	drop SAC_syn

	compress
	save $data/worker_`y', replace
}


//------------------------------------------------------------------------------
// SECTION 2: FIRM-LEVEL CZ COUNT (firm_year_cz.dta)
//
// Counts the number of distinct commuting zones (CZs) in which each firm
// has at least one employed worker in each year.
// Used in analyze_hetero.do to define heterogeneity variables.
//------------------------------------------------------------------------------
clear
forvalues y = 2001/2017 {

	use $data/worker_`y', clear

	* Restrict to observations with valid earnings and known CZ
	drop if mi(t4earn) | t4earn == 0
	drop if mi(sac_syn)

	* Count distinct CZs per firm-year
	gegen num_cz = nunique(sac_syn), by(entid_syn year)

	* Collapse to firm-year level
	duplicates drop entid_syn year, force
	keep entid_syn year num_cz

	save	$data/firm_year_cz_`y', replace
}

drop _all
forvalues y = 2001/2017 {
	append using $data/firm_year_cz_`y',  force
	erase $data/firm_year_cz_`y'.dta
}

compress
save $data/firm_year_cz, replace


//------------------------------------------------------------------------------
// SECTION 3: DEALS WITH OVERLAPPING MARKETS (same_market.dta)
//
// For each deal, determines whether the acquirer and target operate in
// at least one common market (defined as CZ x NAICS industry).
// same_market == 1 means horizontal overlap: both firms are in the same market.
// Collapsed to deal level (one indicator per DEAL).
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {

	use $data/worker_`y', clear
	drop if mi(sac_syn) | mi(naics) | mi(t4earn) | t4earn == 0

	* Collapse to firm x market (CZ x NAICS) x year: count workers as market size
	collapse (count) total_emp_estab = t4earn, by(entid_syn sac_syn naics year)

	* Join with M&A deal data to link each firm-market to its deal
	gen DEAL_YEAR = year
	joinby entid_syn DEAL_YEAR using $data/all_mna

	* Within each deal x market cell, check whether both parties (acquirer and target)
	* are present. If the previous firm ID (sorted: Acquirer first) differs, they share that market.
	gsort DEAL sac_syn naics -Acquirer
	by DEAL sac_syn naics: gen new_firm_id2 = entid_syn[_n-1] if entid_syn ~= entid_syn[_n-1]
	gegen	temp2				= 	mode(new_firm_id2), by(DEAL sac_syn naics)
	gen		same_market_tmp		=	~mi(temp2)              // 1 if both parties share this market

	* same_market at the deal level: 1 if ANY market is shared by both parties
	gegen 	same_market			=	max(same_market_tmp), by(DEAL)

	* Collapse to deal x party level
	collapse (firstnm) same_market year, by(DEAL Acquirer)

	save $data/same_market_`y', replace
}

drop _all
forvalues y = 2001/2017 {
	dis( "`y'")
	append using $data/same_market_`y', force
	erase $data/same_market_`y'.dta
}

compress
save $data/same_market, replace


//------------------------------------------------------------------------------
// SECTION 4: FIRMS WITH OVERLAPPING MARKETS (same_market_firm.dta)
//
// Similar to Section 3, but keeps the market-level grain (DEAL x CZ x NAICS)
// rather than collapsing to the deal level.
// Used to identify market-level treatment for heterogeneity analysis.
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {

	use $data\worker_`y'.dta, clear
	drop if mi(sac_syn) | mi(naics) | mi(t4earn) | t4earn == 0

	* Collapse to firm x market x year
	collapse (count) total_emp_estab = t4earn, by(entid_syn sac_syn naics year)
	gen DEAL_YEAR = year
	joinby entid_syn DEAL_YEAR using $data/all_mna

	* Check whether both deal parties are present in each specific market
	gsort DEAL sac_syn naics -Acquirer
	by DEAL sac_syn naics: gen new_firm_id2 = entid_syn[_n-1] if entid_syn ~= entid_syn[_n-1]
	gegen	temp2				= 	mode(new_firm_id2), by(DEAL sac_syn naics)
	gen		same_market			=	~mi(temp2)

	* Retain market-level detail (not collapsed to deal level)
	collapse (firstnm) same_market year, by(DEAL Acquirer sac_syn naics)

	save $data/same_market_firms_`y', replace
}

drop _all
forvalues y = 2001/2017 {
	append using $data/same_market_firms_`y'.dta, force
	erase $data/same_market_firms_`y'.dta
}

compress
save $data/same_market_firm, replace


//------------------------------------------------------------------------------
// SECTION 5: TREATED MARKETS (treated_market.dta)
//
// For each CZ x NAICS x year cell, flags whether the market was "treated"
// (i.e., contained at least one M&A firm in that year).
// A market is treated if at least one worker in that market-year works at
// a firm that is listed in all_mna for that year.
// Used in concentration analysis and robustness regressions.
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {

	use $data/worker_`y', replace
	drop if mi(sac_syn) | mi(naics) | mi(t4earn) | t4earn == 0

	* Match workers' firm to M&A deal list for the same year
	gen DEAL_YEAR = year
	joinby entid_syn DEAL_YEAR using $data/all_mna, unmatched(master)

	* treated == 1 if the worker's firm is an M&A firm in this market-year
	gen 	treated 	= ~mi(DEAL)

	* Collapse to market level: treated == 1 if ANY firm in that market had an M&A
	collapse (max) treated, by(naics sac_syn year)

	compress
	save $data/treated_market_`y', replace
}

clear
set obs 0
forvalues y = 2001/2017 {
	append using $data/treated_market_`y', force
	erase $data/treated_market_`y'.dta
}

gsort sac_syn naics year
compress
save $data/treated_market, replace
